import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, accuracy_score
from scipy.sparse import hstack
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pickle
import tqdm
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")
# Load the preprocessed DonorsChoose dataset (first 50k rows to bound memory/runtime).
data = pd.read_csv('preprocessed_data.csv', nrows=50000)
data.shape  # notebook-style inspection; has no effect when run as a plain script
# Target: whether the project proposal was approved (binary 0/1).
y = data.project_is_approved
X = data.drop(["project_is_approved"], axis=1)
# Stratified split keeps the class ratio identical across train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.33, stratify=y)
def convert_text_into_TFIDF(feature):
    """Fit a TF-IDF vectorizer on the train split of `feature` and transform both splits.

    Uses 1-4 grams, min_df=10, capped at 5000 features. Fitting happens on the
    train split only, so no information leaks from the test split.

    Returns (fitted_vectorizer, train_matrix, test_matrix).
    """
    train_text = X_train[feature].values
    test_text = X_test[feature].values
    tfidf = TfidfVectorizer(min_df=10, ngram_range=(1, 4), max_features=5000)
    train_matrix = tfidf.fit_transform(train_text)  # fit + transform in one pass
    test_matrix = tfidf.transform(test_text)
    return tfidf, train_matrix, test_matrix
# Load pre-trained GloVe word vectors (mapping word -> 300-d numpy array) from a pickle.
# NOTE(review): pickle.load executes arbitrary code if the file is untrusted — confirm provenance.
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)
# Set of all words covered by the GloVe model, for fast membership checks.
glove_words = set(model.keys())
def calculate_TFIDF_Weighted_W2V(feature_value, dictionary, tfidf_words):
tfidf_w2v_vectors = []
for sentence in feature_value:
vector = np.zeros(300)
tf_idf_weight = 0
for word in sentence.split():
if(word in tfidf_words) and (word in glove_words):
vec = model[word]
tf_idf = dictionary[word] * (sentence.count(word)/len(sentence.split()))
vector += (vec * tf_idf)
tf_idf_weight += tf_idf
if tf_idf_weight != 0:
vector /= tf_idf_weight
tfidf_w2v_vectors.append(vector)
return tfidf_w2v_vectors
def convert_Text_Into_TFIDF_Weighted_W2V(feature):
    """Fit TF-IDF on the train split of `feature`, then embed both splits as
    TF-IDF-weighted GloVe vectors via calculate_TFIDF_Weighted_W2V.

    Returns (fitted tfidf_model, train_vectors, test_vectors).
    """
    tfidf_model = TfidfVectorizer()
    tfidf_model.fit(X_train[feature].values)
    # BUGFIX: get_feature_names() was deprecated in scikit-learn 1.0 and removed
    # in 1.2; prefer get_feature_names_out() with a fallback for old versions.
    if hasattr(tfidf_model, 'get_feature_names_out'):
        feature_names = list(tfidf_model.get_feature_names_out())
    else:
        feature_names = tfidf_model.get_feature_names()
    # word -> idf weight, and the TF-IDF vocabulary as a set for O(1) lookups
    dictionary = dict(zip(feature_names, list(tfidf_model.idf_)))
    tfidf_words = set(feature_names)
    X_train_tfidf_w2v = calculate_TFIDF_Weighted_W2V(X_train[feature].values, dictionary, tfidf_words)
    X_test_tfidf_w2v = calculate_TFIDF_Weighted_W2V(X_test[feature].values, dictionary, tfidf_words)
    return tfidf_model, X_train_tfidf_w2v, X_test_tfidf_w2v
def get_response_code(X_train, y_train):
    """Build the per-category class-count table used for response coding.

    For each distinct value of X_train (in first-appearance order, matching
    .unique()), count how many rows have y_train == 0 vs anything else.

    Returns a DataFrame with columns: category, category_0, category_1, category_tot.
    """
    df = pd.DataFrame({'X_train': X_train, 'y_train': y_train})
    lst_response_code = []
    # PERF BUGFIX: the original re-scanned the frame per category and re-tested
    # an always-true condition inside the loop (quadratic); one groupby pass
    # computes the same counts. sort=False preserves first-appearance order.
    for category, group in df.groupby('X_train', sort=False):
        count_0 = int((group.y_train == 0).sum())
        count_1 = len(group) - count_0
        lst_response_code.append({'category': category,
                                  'category_0': count_0,
                                  'category_1': count_1,
                                  'category_tot': count_0 + count_1})
    return pd.DataFrame(lst_response_code)
def encode_categorical_with_response_code(X, df_response_code):
    """Map each categorical value in X to [P(class 0), P(class 1)] using the
    response-code count table; categories absent from the table get the
    uninformative prior [0.5, 0.5]."""
    encoded = []
    for value in X:
        match = df_response_code[df_response_code.category == value]
        if len(match) == 0:
            # unseen category: fall back to a 50/50 prior
            encoded.append([0.5, 0.5])
        else:
            total = match.category_tot.values[0]
            encoded.append([match.category_0.values[0] / total,
                            match.category_1.values[0] / total])
    return encoded
def convert_categorical_Into_1_and_0(feature):
    """Response-code one categorical feature: build the class-count table from
    the TRAIN split only, then encode both splits with it.

    Returns (count_table, train_encoding, test_encoding).
    """
    df_response_code = get_response_code(X_train[feature], y_train)
    train_encoded = encode_categorical_with_response_code(X_train[feature].values, df_response_code)
    test_encoded = encode_categorical_with_response_code(X_test[feature].values, df_response_code)
    return df_response_code, train_encoded, test_encoded
def encode_numerical_features(feature):
    """L2-scale a numerical column, dividing BOTH splits by the L2 norm of the
    train column so train and test share the same scale.

    Returns (fitted Normalizer, train column shaped (n, 1), test column shaped (n, 1)).
    """
    normalizer = Normalizer()
    train_row = X_train[feature].values.reshape(1, -1)
    normalizer.fit(train_row)  # Normalizer is stateless; fit kept for interface parity
    X_train_scaled = normalizer.transform(train_row)
    # BUGFIX: the test column was previously transformed as its own single sample,
    # i.e. divided by its OWN L2 norm, putting train and test on different scales.
    # Reuse the train norm for the test split instead.
    train_norm = np.linalg.norm(train_row)
    if train_norm == 0:
        train_norm = 1.0  # all-zero column: leave values unchanged
    X_test_scaled = X_test[feature].values.reshape(1, -1) / train_norm
    return normalizer, X_train_scaled.reshape(-1, 1), X_test_scaled.reshape(-1, 1)
def get_essay_sentiment_score(X_essay):
    """Score each essay with VADER.

    Returns a list of [neg, neu, pos, compound] score lists, one per essay.
    """
    analyzer = SentimentIntensityAnalyzer()
    scores = []
    for essay in X_essay:
        polarity = analyzer.polarity_scores(essay)
        scores.append([polarity[key] for key in ('neg', 'neu', 'pos', 'compound')])
    return scores
# Inert reference note (a bare string literal): the raw columns consumed below.
'''
#Categorical
X_train['school_state']
X_train['teacher_prefix']
X_train['project_grade_category']
X_train['clean_categories']
X_train['clean_subcategories']
#numerical
X_train['teacher_number_of_previously_posted_projects']
X_train['price']
#essay
X_train['essay']'''
# --- Feature engineering: encode every raw column on the train/test splits ---
# Response-code encoding (per-category class probabilities) for school_state.
df_rspnse_code_state, X_train_state_0_1, X_test_state_0_1 \
    = convert_categorical_Into_1_and_0('school_state')
lst_state_features = df_rspnse_code_state.category.values
# Response-code encoding for teacher_prefix.
df_rspnse_code_tchr_prfx, X_train_tchr_prfx_0_1, X_test_tchr_prfx_0_1 \
    = convert_categorical_Into_1_and_0('teacher_prefix')
lst_tchr_prfx_features = df_rspnse_code_tchr_prfx.category.values
# Response-code encoding for project_grade_category.
df_rspnse_code_grade, X_train_grade_0_1, X_test_grade_0_1 \
    = convert_categorical_Into_1_and_0('project_grade_category')
lst_grade_features = df_rspnse_code_grade.category.values
# Response-code encoding for clean_categories.
df_rspnse_code_categories, X_train_categories_0_1, X_test_categories_0_1 \
    = convert_categorical_Into_1_and_0('clean_categories')
lst_categories_features = df_rspnse_code_categories.category.values
# Response-code encoding for clean_subcategories.
df_rspnse_code_subcategories, X_train_subcategories_0_1, X_test_subcategories_0_1 \
    = convert_categorical_Into_1_and_0('clean_subcategories')
lst_subcategories_features = df_rspnse_code_subcategories.category.values
# Normalization of teacher_number_of_previously_posted_projects (numeric).
normalizer_prev_projects, X_train_prev_projects, X_test_prev_projects \
    = encode_numerical_features('teacher_number_of_previously_posted_projects')
# Normalization of price (numeric).
normalizer_price, X_train_price, X_test_price = encode_numerical_features('price')
# VADER sentiment scores ([neg, neu, pos, compound]) for each essay.
lst_train_sentiments = get_essay_sentiment_score(X_train['essay'].values)
lst_test_sentiments = get_essay_sentiment_score(X_test['essay'].values)
# TF-IDF vectorization of essay (text representation for feature set 1).
tfdif_vectorizer_essay, X_train_essay_tfidf, X_test_essay_tfidf = convert_text_into_TFIDF('essay')
# TF-IDF-weighted word2vec (GloVe) vectorization of essay (feature set 2).
w2v_vectorizer_essay, X_train_essay_w2v, X_test_essay_w2v = convert_Text_Into_TFIDF_Weighted_W2V('essay')
# --- Feature set 1 (TF-IDF essay): stack everything into one sparse CSR matrix ---
X_tr_set1 = hstack((X_train_state_0_1, X_train_tchr_prfx_0_1, X_train_grade_0_1, X_train_categories_0_1,
                    X_train_subcategories_0_1, X_train_prev_projects, X_train_price, X_train_essay_tfidf,
                    lst_train_sentiments)).tocsr()
X_te_set1 = hstack((X_test_state_0_1, X_test_tchr_prfx_0_1, X_test_grade_0_1, X_test_categories_0_1,
                    X_test_subcategories_0_1, X_test_prev_projects, X_test_price, X_test_essay_tfidf,
                    lst_test_sentiments)).tocsr()
# --- Feature set 2 (TF-IDF-weighted word2vec essay): dense numpy stacking ---
X_tr_set2 = np.hstack((X_train_state_0_1, X_train_tchr_prfx_0_1, X_train_grade_0_1, X_train_categories_0_1,
                       X_train_subcategories_0_1, X_train_prev_projects, X_train_price, X_train_essay_w2v,
                       lst_train_sentiments))
X_te_set2 = np.hstack((X_test_state_0_1, X_test_tchr_prfx_0_1, X_test_grade_0_1, X_test_categories_0_1,
                       X_test_subcategories_0_1, X_test_prev_projects, X_test_price, X_test_essay_w2v,
                       lst_test_sentiments))
class CV_Results:
    """Mean AUC score for one (max_depth, n_estimators) hyperparameter pair."""

    def __init__(self, depth, estimator, acc_score):
        self.depth = depth          # max_depth value
        self.estimator = estimator  # n_estimators value
        self.acc_score = acc_score  # mean AUC across CV folds (despite the name)

    def __repr__(self):
        # added for debuggability when printing lists of results
        return f'CV_Results(depth={self.depth}, estimator={self.estimator}, acc_score={self.acc_score})'
def getTrain_and_Cv_scores(grid_search, depths=None, estimators=None, folds=None):
    """Extract per-fold train/CV scores from a fitted GridSearchCV and average them.

    Relies on cv_results_ row ordering: candidates iterate max_depth (outer)
    then n_estimators (inner), matching GridSearchCV's sorted param grid.

    Parameters
    ----------
    grid_search : fitted GridSearchCV with return_train_score results.
    depths, estimators, folds : optional grid values / fold count; default to
        the module-level depth_range / n_estimator / n_folds used to build the grid.

    Returns (train_results, cv_results): parallel lists of CV_Results, one per
    (depth, estimator) pair, holding the mean score across folds.
    """
    depths = depth_range if depths is None else depths
    estimators = n_estimator if estimators is None else estimators
    folds = n_folds if folds is None else folds
    lst_train_scores = []
    lst_cv_scores = []
    idx_score = 0  # candidate index into the cv_results_ arrays
    for depth in depths:
        for estimator in estimators:
            train_scores = []
            cv_scores = []
            for k in range(folds):
                train_scores.append(grid_search.cv_results_['split' + str(k) + '_train_score'][idx_score])
                cv_scores.append(grid_search.cv_results_['split' + str(k) + '_test_score'][idx_score])
            # one candidate consumed: advance once per (depth, estimator) pair
            idx_score += 1
            lst_train_scores.append(CV_Results(depth, estimator, np.mean(np.array(train_scores))))
            lst_cv_scores.append(CV_Results(depth, estimator, np.mean(np.array(cv_scores))))
    return lst_train_scores, lst_cv_scores
def print_train_cv_score(lst_train_scores, lst_cv_scores):
    """Print the mean train and CV AUC for every hyperparameter pair."""
    sections = (('\n---------Train scores----------', lst_train_scores),
                ('\n---------CV scores----------', lst_cv_scores))
    for header, results in sections:
        print(header)
        for scores in results:
            print(f'Depth :{scores.depth} Estimator :{scores.estimator} Auc score:{scores.acc_score}')
def plot_HeatMap(lst_train_scores, lst_cv_scores):
    """Draw side-by-side heatmaps of mean train and CV AUC over the
    (n_estimators, max_depth) grid, from lists of CV_Results."""
    fig = plt.figure(figsize=(15, 5))
    for plot_counter, data in enumerate([lst_train_scores, lst_cv_scores], start=1):
        estimator = [x.estimator for x in data]
        depth = [x.depth for x in data]
        auc_scores = [x.acc_score for x in data]
        # https://stackoverflow.com/questions/45470882/x-y-z-array-data-to-heatmap/45660022
        df = pd.DataFrame.from_dict(np.array([estimator, depth, auc_scores]).T)
        df.columns = ['Estimator', 'Depth', 'AUC Scores']
        df['AUC Scores'] = pd.to_numeric(df['AUC Scores'])
        # BUGFIX: positional DataFrame.pivot() arguments were deprecated in pandas
        # 1.1 and removed in 2.0; keyword arguments work on all versions.
        pivotted = df.pivot(index='Estimator', columns='Depth', values='AUC Scores')
        sns.heatmap(pivotted, ax=fig.add_subplot(1, 2, plot_counter), annot=True, cmap='coolwarm')
        if plot_counter == 1:
            plt.title('Train AUC Scores for each Hyperparameter')
        else:
            plt.title('CV AUC Scores for each Hyperparameter')
def plot_AUC(train_fpr, train_tpr, test_fpr, test_tpr, train_auc, test_auc, title):
    """Plot the ROC curves for train and test (FPR vs TPR) plus the chance diagonal."""
    curves = ((train_fpr, train_tpr, f"Train AUC = {train_auc}"),
              (test_fpr, test_tpr, f"Test AUC = {test_auc}"))
    for fpr, tpr, label in curves:
        plt.plot(fpr, tpr, label=label)
    # chance-level diagonal for reference
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.legend()
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.grid()
    plt.show()
def create_wordcloud_using_fp_essay(X_test_fp_essay):
    """Render a word cloud from the essays of false-positive predictions.

    Each essay is stringified, whitespace-tokenized and lowercased; STOPWORDS
    filtering is delegated to WordCloud itself.
    """
    stopwords = set(STOPWORDS)
    tokens = []
    for val in X_test_fp_essay:
        # typecast each value to string, split on whitespace, lowercase each token
        tokens.extend(token.lower() for token in str(val).split())
    # PERF BUGFIX: the original built the text with repeated string +=, which is
    # quadratic; a single join produces the identical string in one pass.
    comment_words = ' ' + ''.join(token + ' ' for token in tokens)
    wordcloud = WordCloud(width = 800, height = 800,
                          background_color ='white',
                          stopwords = stopwords,
                          min_font_size = 10).generate(comment_words)
    # plot the WordCloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
def plot_Box_plot(X_te_fp_price):
    """Box plot of the (normalized) price values of false-positive test points."""
    ax = sns.boxplot(data= X_te_fp_price)
    ax.set_xlabel("Price")
    ax.set_ylabel("Percentiles")
    ax.set_title("Price vs Likelihood of more datapoints")
    plt.show()
def plot_PDF(X_test_fp_tchr_prfx_0, X_test_fp_tchr_prfx_1):
    """Overlay density histograms (PDF estimates) of the two response-code
    columns for false-positive test points."""
    plt.close()
    a4_dims = (7, 7)
    fig, ax = plt.subplots(figsize=a4_dims)
    # https://python-graph-gallery.com/25-histogram-with-several-variables-seaborn/
    # BUGFIX: sns.distplot was deprecated in seaborn 0.11 and later removed;
    # histplot(stat='density', kde=True) is the supported equivalent.
    sns.histplot(X_test_fp_tchr_prfx_0, bins=10, color="skyblue", label="0 Response Code",
                 stat="density", kde=True, ax=ax)
    sns.histplot(X_test_fp_tchr_prfx_1, bins=10, color="red", label="1 Response Code",
                 stat="density", kde=True, ax=ax)
    plt.legend()
    plt.xlabel("Bins")
    plt.ylabel("Likelihood of false positive")
    plt.title("PDF plot")
    plt.grid()
    plt.show();
# Hyperparameter grid for the XGBoost GBDT: tree depth x number of trees.
depth_range = [1, 2, 3, 4, 5]
n_estimator = [70, 80, 90, 100]
param = {'max_depth': depth_range, 'n_estimators': n_estimator}
n_folds = 3
clf = xgb.XGBClassifier()
# Grid search (3-fold CV, AUC metric) over max_depth/n_estimators on feature set 1 (TF-IDF).
clf_set1 = GridSearchCV(estimator = clf, param_grid= param, cv=n_folds, scoring='roc_auc')
clf_set1.fit(X_tr_set1, y_train)
clf_set1.best_params_  # notebook-style inspection; no effect as a script
lst_train_scores_set1, lst_cv_scores_set1 = getTrain_and_Cv_scores(clf_set1)
print_train_cv_score(lst_train_scores_set1, lst_cv_scores_set1)
plot_HeatMap(lst_train_scores_set1, lst_cv_scores_set1)
# Retrain a fresh model on all of train with the best hyperparameters found above.
GBDT_clf_set1 = xgb.XGBClassifier(max_depth= clf_set1.best_params_['max_depth'],
                                  n_estimators= clf_set1.best_params_['n_estimators'])
GBDT_clf_set1.fit(X_tr_set1, y_train)
# Probability of the positive class (column 1), needed for ROC/AUC.
y_train_proba_set1 = GBDT_clf_set1.predict_proba(X_tr_set1)[:,1]
y_test_proba_set1 = GBDT_clf_set1.predict_proba(X_te_set1)[:,1]
# AUC on train and test data.
train_auc_set1 = roc_auc_score(y_train, y_train_proba_set1)
print('Train Auc for set 1')
print(train_auc_set1)
test_auc_set1 = roc_auc_score(y_test, y_test_proba_set1)
print('\n Test Auc for set 1')
print(test_auc_set1)
# FPR and TPR for both splits, used to draw the ROC curves.
train_fpr_set1, train_tpr_set1, train_threshold_set1 = roc_curve(y_train, y_train_proba_set1)
test_fpr_set1, test_tpr_set1, test_threshold_set1 = roc_curve(y_test, y_test_proba_set1)
# Plot the train/test ROC curves.
plot_AUC(train_fpr_set1, train_tpr_set1, test_fpr_set1, test_tpr_set1, train_auc_set1, test_auc_set1,
         'ROC curve on Train and Test data for Feature set 1')
# Predict hard labels on the test split and analyze the errors.
y_test_pred_set1 = GBDT_clf_set1.predict(X_te_set1)
confusion_matrix_set1 = confusion_matrix(y_test, y_test_pred_set1)
# Seaborn heatmap representation of the test confusion matrix.
sns.heatmap(confusion_matrix_set1, annot=True, fmt="d")
# Collect POSITIONS of false positives (actual 0, predicted 1).
# PERF BUGFIX: np.array(y_test) was rebuilt on every loop iteration (quadratic);
# convert once before the loop.
y_test_arr_set1 = np.array(y_test)
fp_indices = []
for i in range(len(y_test)):
    if (y_test_arr_set1[i] == 0) & (y_test_pred_set1[i] == 1):
        fp_indices.append(i)
# Word cloud from essays of the false-positive points.
# BUGFIX: fp_indices are positions, but Series[...] indexes by label and X_test
# keeps the shuffled original labels after train_test_split — use .iloc.
X_test_fp_set1 = X_test['essay'].iloc[fp_indices]
create_wordcloud_using_fp_essay(X_test_fp_set1)
# Box plot of the (normalized) price of false-positive points.
plot_Box_plot(X_test_price[fp_indices])
# PDF of the teacher_prefix response-code columns for false positives.
X_test_tchr_prfx_0_1_fp = np.array(X_test_tchr_prfx_0_1)[fp_indices]
# BUGFIX: the second argument was X_test_tchr_prfx_0_1_fp[:1] — the first ROW —
# while the intent is column 1 (the class-1 probability), i.e. [:, 1].
plot_PDF(X_test_tchr_prfx_0_1_fp[:, 0], X_test_tchr_prfx_0_1_fp[:, 1])
clf = xgb.XGBClassifier()
# Grid search (3-fold CV, AUC metric) over max_depth/n_estimators on feature set 2 (TF-IDF weighted W2V).
clf_set2 = GridSearchCV(estimator = clf, param_grid= param, cv=n_folds, scoring='roc_auc')
clf_set2.fit(X_tr_set2, y_train)
clf_set2.best_params_  # notebook-style inspection; no effect as a script
lst_train_scores_set2, lst_cv_scores_set2 = getTrain_and_Cv_scores(clf_set2)
print_train_cv_score(lst_train_scores_set2, lst_cv_scores_set2)
plot_HeatMap(lst_train_scores_set2, lst_cv_scores_set2)
# Retrain a fresh model on all of train with the best hyperparameters found above.
GBDT_clf_set2 = xgb.XGBClassifier(max_depth= clf_set2.best_params_['max_depth'],
                                  n_estimators= clf_set2.best_params_['n_estimators'])
GBDT_clf_set2.fit(X_tr_set2, y_train)
# Probability of the positive class (column 1), needed for ROC/AUC.
y_train_proba_set2 = GBDT_clf_set2.predict_proba(X_tr_set2)[:,1]
y_test_proba_set2 = GBDT_clf_set2.predict_proba(X_te_set2)[:,1]
# AUC on train and test data.
train_auc_set2 = roc_auc_score(y_train, y_train_proba_set2)
print('Train Auc for set 2')
print(train_auc_set2)
test_auc_set2 = roc_auc_score(y_test, y_test_proba_set2)
print('\n Test Auc for set 2')
print(test_auc_set2)
# FPR and TPR for both splits, used to draw the ROC curves.
train_fpr_set2, train_tpr_set2, train_threshold_set2 = roc_curve(y_train, y_train_proba_set2)
test_fpr_set2, test_tpr_set2, test_threshold_set2 = roc_curve(y_test, y_test_proba_set2)
# Plot the train/test ROC curves.
plot_AUC(train_fpr_set2, train_tpr_set2, test_fpr_set2, test_tpr_set2, train_auc_set2, test_auc_set2,
         'ROC curve on Train and Test data for Feature set 2')
# Predict hard labels on the test split and analyze the errors.
y_test_pred_set2 = GBDT_clf_set2.predict(X_te_set2)
confusion_matrix_set2 = confusion_matrix(y_test, y_test_pred_set2)
# Seaborn heatmap representation of the test confusion matrix.
sns.heatmap(confusion_matrix_set2, annot=True, fmt="d")
# Collect POSITIONS of false positives (actual 0, predicted 1).
# PERF BUGFIX: np.array(y_test) was rebuilt on every loop iteration (quadratic);
# convert once before the loop.
y_test_arr_set2 = np.array(y_test)
fp_indices = []
for i in range(len(y_test)):
    if (y_test_arr_set2[i] == 0) & (y_test_pred_set2[i] == 1):
        fp_indices.append(i)
# Word cloud from essays of the false-positive points.
# BUGFIX: fp_indices are positions, but Series[...] indexes by label and X_test
# keeps the shuffled original labels after train_test_split — use .iloc.
X_test_fp_set2 = X_test['essay'].iloc[fp_indices]
create_wordcloud_using_fp_essay(X_test_fp_set2)
# Box plot of the (normalized) price of false-positive points.
plot_Box_plot(X_test_price[fp_indices])
# PDF of the teacher_prefix response-code columns for false positives.
# NOTE(review): the original comment claimed this was the previously-posted-projects
# feature, but the variable used is the teacher_prefix encoding — confirm intent.
X_test_tchr_prfx_0_1_fp = np.array(X_test_tchr_prfx_0_1)[fp_indices]
# BUGFIX: the second argument was X_test_tchr_prfx_0_1_fp[:1] — the first ROW —
# while the intent is column 1 (the class-1 probability), i.e. [:, 1].
plot_PDF(X_test_tchr_prfx_0_1_fp[:, 0], X_test_tchr_prfx_0_1_fp[:, 1])
from prettytable import PrettyTable
# Final comparison of the two feature sets.
x = PrettyTable()
x.field_names = ["Vectorizer", "Model", "Hyper parameter", "AUC"]
# BUGFIX: the test AUC values were swapped between the rows — the TFIDF row
# reported test_auc_set2 and the W2V row test_auc_set1. Pair each model's
# hyperparameters with its own AUC.
x.add_row(["TFIDF", 'GBDT', clf_set1.best_params_, test_auc_set1])
x.add_row(["W2V", 'GBDT', clf_set2.best_params_, test_auc_set2])
print(x)